This is our final group assignment for the Data Visualization and Storytelling course at LBS (MAM 2024).
In a world brimming with diverse cultures, economies, and environments, the quest for happiness remains a universal endeavor. Our group’s project embarks on an exploratory journey through the World Happiness Reports from 2015 to 2022. These reports, an amalgam of data, surveys, and research, offer a unique lens to view the changing landscapes of global happiness.
Our github repo is located here: https://github.com/Madikan/am10-mam2024-world-happiness
The database we are using is the World Happiness Report from 2015 to 2022, and can be found here: https://www.kaggle.com/datasets/mayzannilarthein44/world-happiness-report-2015-to-2022/
We also used Global Country Information Dataset 2023 to draw additional insights: https://www.kaggle.com/datasets/nelgiriyewithana/countries-of-the-world-2023
# Import data and clean names
whr2015_2022 <- read.csv(here::here("data/world-happiness-report-2015-2022-cleaned.csv"),
stringsAsFactors = FALSE) %>%
janitor::clean_names()
# Remove the index column
whr2015_2022 <- select(whr2015_2022, -x)
# Replace commas with dots and remove asterisks
whr2015_2022 <- data.frame(lapply(whr2015_2022, function(x) gsub(",", ".", x)))
whr2015_2022 <- data.frame(lapply(whr2015_2022, function(x) gsub("\\*", "", x)))
# Convert columns to numeric
columns_to_convert <- c("happiness_rank", "happiness_score", "economy_gdp_per_capita", "family_social_support", "health_life_expectancy", "freedom", "trust_government_corruption", "generosity", "year")
whr2015_2022[columns_to_convert] <- lapply(whr2015_2022[columns_to_convert], as.numeric)
# Mapping for inconsistent country names
country_name_mapping <- c(
"Taiwan Province of China" = "Taiwan",
"Hong Kong S.A.R. of China" = "Hong Kong",
"Hong Kong S.A.R., China" = "Hong Kong",
"Hong Kong S.A.R.. China" = "Hong Kong",
"Czechia" = "Czech Republic",
"North Macedonia" = "Macedonia",
"Trinidad & Tobago" = "Trinidad and Tobago",
"North Cyprus" = "Northern Cyprus",
"Somaliland region" = "Somalia",
"Somaliland Region" = "Somalia",
"Palestinian Territories" = "Palestine",
"Eswatini. Kingdom of" = "Swaziland")
# Apply the mapping to consolidate country names
whr2015_2022$country <- mapvalues(whr2015_2022$country, from = names(country_name_mapping), to = country_name_mapping)
# Mapping for inconsistent region names
region_name_mapping <- c(
"Eastern Asia" = "East Asia",
"Southeastern Asia" = "Southeast Asia",
"Southern Asia" = "South Asia",
"Middle East and Northern Africa" = "Middle East and North Africa")
# Apply the mapping to consolidate region names
whr2015_2022$region <- mapvalues(whr2015_2022$region, from = names(region_name_mapping), to = region_name_mapping)
# Define the correct region for each country based on the standardized assignments
correct_regions <- c(
"Armenia" = "Central and Eastern Europe",
"Australia" = "Australia and New Zealand",
"Taiwan" = "East Asia",
"Belize" = "Latin America and Caribbean",
"Hong Kong" = "East Asia",
"Somalia" = "Sub-Saharan Africa",
"Namibia" = "Sub-Saharan Africa",
"South Sudan" = "Sub-Saharan Africa",
"Trinidad and Tobago" = "Latin America and Caribbean",
"North Cyprus" = "Western Asia or Europe",
"Macedonia" = "Central and Eastern Europe",
"Gambia" = "Sub-Saharan Africa",
"Luxembourg" = "Western Europe",
"Czech Republic" = "Central and Eastern Europe",
"Guatemala" = "Latin America and Caribbean",
"Kuwait" = "Middle East and North Africa",
"Belarus" = "Central and Eastern Europe",
"Turkmenistan" = "Central and Eastern Europe",
"Libya" = "Middle East and North Africa",
"Azerbaijan" = "Central and Eastern Europe",
"Liberia" = "Sub-Saharan Africa",
"Congo" = "Sub-Saharan Africa",
"Niger" = "Sub-Saharan Africa",
"Comoros" = "Sub-Saharan Africa",
"Palestine" = "Middle East and North Africa",
"Swaziland" = "Sub-Saharan Africa",
"Madagascar" = "Sub-Saharan Africa",
"Chad" = "Sub-Saharan Africa",
"Yemen" = "Middle East and North Africa",
"Mauritania" = "Sub-Saharan Africa",
"Lesotho" = "Sub-Saharan Africa",
"Botswana" = "Sub-Saharan Africa",
"Rwanda" = "Sub-Saharan Africa",
"Canada" = "North America",
"Georgia" = "Central and Eastern Europe",
"Kazakhstan" = "Central and Eastern Europe",
"Kyrgyzstan" = "Central and Eastern Europe",
"Moldova" = "Central and Eastern Europe",
"New Zealand" = "Australia and New Zealand",
"Russia" = "Central and Eastern Europe",
"Tajikistan" = "Central and Eastern Europe",
"Ukraine" = "Central and Eastern Europe",
"United States" = "North America",
"Uzbekistan" = "Central and Eastern Europe",
"Northern Cyprus" = "Western Europe"
)
# Update the region for each country in the dataset
for (country in names(correct_regions)) {
whr2015_2022[whr2015_2022$country == country, "region"] <- correct_regions[country]
}
# View the updated data
glimpse(whr2015_2022)## Rows: 1,229
## Columns: 11
## $ happiness_rank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ country <chr> "Switzerland", "Iceland", "Denmark", "Norw…
## $ region <chr> "Western Europe", "Western Europe", "Weste…
## $ happiness_score <dbl> 7.587, 7.561, 7.527, 7.522, 7.427, 7.406, …
## $ economy_gdp_per_capita <dbl> 1.39651, 1.30232, 1.32548, 1.45900, 1.3262…
## $ family_social_support <dbl> 1.34951, 1.40223, 1.36058, 1.33095, 1.3226…
## $ health_life_expectancy <dbl> 0.94143, 0.94784, 0.87464, 0.88521, 0.9056…
## $ freedom <dbl> 0.66557, 0.62877, 0.64938, 0.66973, 0.6329…
## $ trust_government_corruption <dbl> 0.41978, 0.14145, 0.48357, 0.36503, 0.3295…
## $ generosity <dbl> 0.29678, 0.43630, 0.34139, 0.34699, 0.4581…
## $ year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, …
# Import additional data and clean names
world_data_2023 <- read_csv(here::here("data/world-data-2023.csv")) %>%
janitor::clean_names()
# Create a function that converts percentage strings to numeric values
convert_percent_to_numeric <- function(column) {
if (is.character(column)) {
# Remove the "%" sign and convert to numeric
column <- str_remove_all(column, "%")
column <- as.numeric(str_replace_all(column, ",", "."))
}
column
}
# Apply the conversion function to all columns that end with "_percent"
world_data_2023 <- world_data_2023 %>%
mutate(
across(
.cols = c(ends_with("_percent"), ends_with("_rate")),
.fns = ~convert_percent_to_numeric(.x)
)
) %>%
# Divide columns ending with "_percent" or "_rate" by 100
mutate(
across(
.cols = c(ends_with("_percent"), ends_with("_rate")),
.fns = ~ .x / 100
)
)
# Replace commas with dots and remove asterisks from all character columns
world_data_2023 <- world_data_2023 %>%
mutate(across(where(is.character), ~str_replace_all(.x, ",", ".")))
# Remove asterisks from all character columns (if any)
world_data_2023 <- world_data_2023 %>%
mutate(across(where(is.character), ~str_replace_all(.x, "\\*", "")))
# Create 2022 dataset
whr2022 <- whr2015_2022 %>%
filter(year == 2022)
# Perform the left join
joined_data <- left_join(whr2022, world_data_2023, by = "country")
# Add CO2 emissions and Urban population columns
joined_data <- joined_data %>%
mutate(
co2_emissions_per_capita = case_when(
is.na(co2_emissions) | is.na(population) | population == 0 ~ NA_real_,
TRUE ~ co2_emissions / population
),
urban_population_per_capita = case_when(
is.na(urban_population) | is.na(population) | population == 0 ~ NA_real_,
TRUE ~ urban_population / population
)
)| Name | whr2015_2022 |
| Number of rows | 1229 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 0 | 1 | 4 | 24 | 0 | 165 | 0 |
| region | 0 | 1 | 9 | 28 | 0 | 10 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| happiness_rank | 0 | 1 | 77.47 | 44.47 | 1.0 | 39.00 | 77.00 | 116.00 | 158.00 | ▇▇▇▇▇ |
| happiness_score | 0 | 1 | 5.43 | 1.12 | 2.4 | 4.58 | 5.41 | 6.22 | 7.84 | ▁▅▇▇▃ |
| economy_gdp_per_capita | 0 | 1 | 0.98 | 0.43 | 0.0 | 0.67 | 1.01 | 1.30 | 2.21 | ▃▅▇▃▁ |
| family_social_support | 0 | 1 | 1.03 | 0.33 | 0.0 | 0.83 | 1.07 | 1.27 | 1.64 | ▁▂▆▇▅ |
| health_life_expectancy | 0 | 1 | 0.61 | 0.24 | 0.0 | 0.44 | 0.64 | 0.79 | 1.14 | ▂▅▇▇▂ |
| freedom | 0 | 1 | 0.44 | 0.15 | 0.0 | 0.34 | 0.46 | 0.56 | 0.74 | ▁▃▆▇▃ |
| trust_government_corruption | 0 | 1 | 0.13 | 0.11 | 0.0 | 0.06 | 0.10 | 0.16 | 0.59 | ▇▃▁▁▁ |
| generosity | 0 | 1 | 0.20 | 0.12 | 0.0 | 0.12 | 0.19 | 0.26 | 0.84 | ▇▇▂▁▁ |
| year | 0 | 1 | 2018.45 | 2.28 | 2015.0 | 2016.00 | 2018.00 | 2020.00 | 2022.00 | ▇▃▇▃▇ |
# Create dataframe with data
country_data <- data.frame(
country=whr2015_2022$country,
value=whr2015_2022$happiness_score)
# Define the colors for the low, mid, and high values
low_color <- "#FF9999" # Softer red
mid_color <- "#FFFF99" # Softer yellow
high_color <- "#99CC99" # Softer green
# Create the color palette function
cols <- colorRampPalette(c(low_color, mid_color, high_color))
# Use cols function to generate the number of colors we need
palette_colors <- cols(length(whr2015_2022))
# Countries to Map function
capture.output(n <- invisible(joinCountryData2Map(country_data,
joinCode="NAME",
nameJoinColumn="country")), file='NUL')
# Output plot in pdf
pdf("world_happiness_map.pdf", width = 10, height = 7)
mapCountryData(n,
nameColumnToPlot="value",
mapTitle="Most unhappy countries are located in Sub-Saharan Africa and South-East Asia\nWorld Map for Happiness Score 2015-2022",
colourPalette=palette_colors,
oceanCol = "#F0F8FF",
missingCountryCol = "#CCCCCCCC",
addLegend = TRUE,
aspect = 1.1,
borderCol = "Black",
lwd =.1)
legend("bottom", # Adjust position as needed
legend=c("Low", "Medium", "High"), # Example categories
fill=c(low_color, mid_color, high_color), # Corresponding colors
title="Happiness Score", # Title of the legend
cex=0.8) # Adjust text size as needed
capture.output(dev.off(), file='NUL')
# Output plot in R console
mapCountryData(n,
nameColumnToPlot="value",
mapTitle="Most unhappy countries are located in Sub-Saharan Africa and South-East Asia\nWorld Map for Happiness Score 2015-2022",
colourPalette=palette_colors,
oceanCol = "#F0F8FF",
missingCountryCol = "#CCCCCCCC",
addLegend = TRUE,
aspect = 1.1,
borderCol = "Black",
lwd =.1)
legend("bottom", # Adjust position as needed
legend=c("Low", "Medium", "High"), # Example categories
fill=c(low_color, mid_color, high_color), # Corresponding colors
title="Happiness Score", # Title of the legend
cex=0.8) # Adjust text size as needed# Select top 10 and bottom 10 countries based on happiness score
top10_bottom10_countries <- whr2022 %>%
arrange(desc(happiness_score)) %>%
slice(c(1:10, (n()-9):n()))
# Plotting
ggplot(top10_bottom10_countries) +
geom_point(aes(x = economy_gdp_per_capita,
y = happiness_score,
size = happiness_score,
colour = factor(region),
alpha = 0.85)) +
scale_size_continuous(range = c(2, 15)) +
geom_vline(xintercept = 1.4, colour = "#f7347a", linetype = "longdash") +
geom_hline(yintercept = 5, colour = "#f7347a", linetype = "longdash") +
geom_text(aes(x = economy_gdp_per_capita, y = happiness_score, label = country),
hjust = "left",
vjust = "bottom",
check_overlap = TRUE,
size = 3) +
theme(legend.position = "none") +
labs(title = "There is a high correlation between higher GDP per capita and higher \nhappiness score, but there can also be lower happiness in higher \nGDP per capita countries",
subtitle = "Happiness relative to GDP per capita for Top 5 and Bottom 5 countries in 2022",
x = "GDP per capita",
y = "Happiness score") +
annotate("text", x = 0.83, y = 5.2, family = "Helvetica", size = 2.7, color = "gray20",
label = "Lower GDP per capita") +
annotate("text", x = 1.95, y = 5.2, family = "Helvetica", size = 2.7, color = "gray20",
label = "Higher GDP per capita") +
annotate("text", x = 1.53, y = 2.3, family = "Helvetica", size = 2.7, color = "gray20",
label = "Lower Happiness") +
annotate("text", x = 1.53, y = 8, family = "Helvetica", size = 2.7, color = "gray20",
label = "Higher Happiness")# Getting top 10 countries
whr2015_2022_top10 <- whr2022 %>%
slice_max(happiness_score, n = 10) %>%
mutate(cat = 'top_10',
country_rank = rank(-happiness_score),
country_label = paste0(country, ' (', country_rank, ')'))
# Getting bottom 10 countries
whr2015_2022_bottom10 <- whr2022 %>%
mutate(country_rank = rank(happiness_score),
country_label = paste0(country, ' (', country_rank, ')')) %>%
slice_min(happiness_score, n = 10) %>%
mutate(cat = 'bottom_10')
# Plotting top 10 happiest countries
top_10 <- ggplot(whr2015_2022_top10, aes(x = reorder(country_label, happiness_score))) +
geom_chicklet(aes(y = 10, fill = 4.9), width = 0.5, radius = grid::unit(5, "pt")) +
geom_chicklet(aes(y = happiness_score, fill = happiness_score), width = 0.5, radius = grid::unit(5, "pt")) +
geom_text(aes(y = happiness_score), label = round(whr2015_2022_top10$happiness_score, 2), nudge_y = 0.4, size = 3) +
scale_y_continuous(expand = c(0, 0.1), position = "right", limits = c(0, 10)) +
scale_fill_gradient2(low = 'black', high = '#818aeb', mid = 'white', midpoint = 5) +
coord_flip() +
labs(y="Best possible life = 10", x = '',
title="Top 10 Happiest Countries in 2022",
subtitle="8 of the happiest countries present in Europe",
caption="Source: The World Happiness Report 2022") +
theme_ipsum(grid = '') +
theme(plot.title = element_text(size=15),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(size = 10),
axis.title.x = element_text(size= 10, color = '#555955'),
axis.text.y = element_text(size = 10, color = 'black'),
axis.text.x = element_blank(),
legend.position = 'None')
# Plotting 10 saddest countries
bottom_10 <- ggplot(whr2015_2022_bottom10, aes(x = reorder(country_label, -happiness_score))) +
geom_chicklet(aes(y = 10, fill = 4.9), width = 0.5, radius = grid::unit(5, "pt")) +
geom_chicklet(aes(y = happiness_score, fill = happiness_score), width = 0.5, radius = grid::unit(5, "pt")) +
geom_text(aes(y = happiness_score), label = round(whr2015_2022_bottom10$happiness_score, 2), nudge_y = 0.4, size = 3) +
scale_y_continuous(expand = c(0, 0.1), position = "right", limits = c(0, 10)) +
scale_fill_gradient2(low = '#074040', high = '#4cc2c2', mid = 'white', midpoint = 5) +
coord_flip() +
labs(y="Best possible life = 10", x = '',
title="Top 10 Saddest Countries in 2022",
subtitle="Ordered from saddest to less sad",
caption="Source: The World Happiness Report 2022") +
theme_ipsum(grid = '') +
theme(plot.title = element_text(size=15),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(size = 10),
axis.title.x = element_text(size= 10, color = '#555955'),
axis.text.y = element_text(size = 10, color = 'black'),
axis.text.x = element_blank(),
legend.position = 'None')
# Displaying plots side by side
top_10 + bottom_10# Creating a new variable for sorted regions
whr2022_sorted <- whr2022 %>%
group_by(region) %>%
mutate(avg_happiness = mean(happiness_score)) %>%
ungroup() %>%
mutate(region_sorted = reorder(region, avg_happiness))
# ggplot code
ggplot(whr2022_sorted, aes(x = region_sorted, y = happiness_score, fill = region_sorted)) +
geom_beeswarm(aes(color = region_sorted), size = 2.5, alpha = 0.8) +
geom_boxplot(aes(group = region_sorted), alpha = 0.3, outlier.shape = NA) +
labs(title = "Middle East and North African countries have the highest disparity in \nHappiness score",
subtitle = "Country-wise Happiness Trends in World Regions",
x = "Region",
y = "Happiness Score") +
geom_hline(yintercept = 5, color = "#f7347a", linetype = "longdash") +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, size = 9)) +
scale_x_discrete(labels = scales::wrap_format(10)) +
scale_fill_brewer(palette = "Spectral") +
scale_color_brewer(palette = "Spectral")# ggplotly code
region_level <- ggplot(whr2022_sorted, aes(x = region_sorted, y = happiness_score, fill = region_sorted, text = country)) +
geom_beeswarm(aes(color = region_sorted, alpha = 1)) +
labs(title = "Country-wise happiness trends in world regions",
subtitle = "Middle East and North African countries have the highest disparity in Happiness score",
x = "Region",
y = "Happiness score") +
geom_hline(yintercept = 5, colour = "#f7347a", linetype = "longdash") +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, hjust = 1, size = 8)) +
scale_x_discrete(labels = wrap_format(10)) +
scale_fill_brewer(palette = "Spectral") +
scale_color_brewer(palette = "Spectral") +
geom_boxplot(aes(alpha = 2))
# Convert to ggplotly with tooltips
ggplotly(region_level, tooltip = c("country", "happiness_score"))%>%
layout(title = list(text = paste0('Middle East and North Africa has the highest disparity in Happiness',
'<br>',
'<sup>',
'Country-wise happiness trends in world regions',
'</sup>')))# Calculate the correlation coefficients
cor_co2 <- cor(joined_data$happiness_score, log10(joined_data$co2_emissions_per_capita), use = "complete.obs")
cor_urban <- cor(joined_data$happiness_score, joined_data$urban_population_per_capita, use = "complete.obs")
# Scatter plot for Happiness Score vs CO2 Emissions Per Capita (Log Scale)
ggplot(joined_data, aes(x = co2_emissions_per_capita, y = happiness_score, size = co2_emissions)) +
geom_point(alpha = 0.5) + # Points with transparency
geom_smooth(method = "lm", se = TRUE, show.legend = FALSE) + # Linear model fit
scale_x_log10() + # Log scale for x-axis
labs(
title = "Higher CO2 Emissions Per Capita is correlated with Higher Happiness",
subtitle = paste("Happiness Score relative to CO2 Emissions Per Capita (Log Scale), r =", round(cor_co2, 2)),
x = "CO2 Emissions Per Capita (Log Scale)",
y = "Happiness Score",
size = "CO2 Emissions"
) +
theme_classic()# Calculate correlation coefficients
cor_life_expectancy <- cor(joined_data$happiness_score, joined_data$life_expectancy, use = "complete.obs")
# Create a scatter plot for Happiness Score vs Life Expectancy
ggplot(joined_data, aes(x = life_expectancy, y = happiness_score)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Longer Life Expectancy is correlated with Higher Happiness",
subtitle = paste("Happiness Score relative to Life Expectancy, r =", round(cor_life_expectancy, 2)),
x = "Life Expectancy",
y = "Happiness Score"
) +
theme_classic()# Population density plot
ggplot(joined_data, aes(x = density_p_km2, y = happiness_score, size = urban_population)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Population Density is inconclusively linked to Happiness",
subtitle = paste("Happiness Score relative to Population Density, r =", round(cor_urban, 2)),
x = "Population density",
y = "Happiness Score") +
theme_classic() +
theme(legend.key.size = unit(0.5, "lines"))+
xlim(0, 500)+
theme(legend.position = "none")# Scatter plot for Happiness Score vs Urban Population Per Capita
ggplot(joined_data, aes(x = urban_population_per_capita, y = happiness_score, size = urban_population)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE, show.legend = FALSE) +
labs(
title = "Urban Population Per Capita is positively correlated to Happiness",
subtitle = paste("Happiness Score relative to Urban Population Per Capita, r =", round(cor_urban, 2)),
x = "Urban Population Per Capita",
y = "Happiness Score",
size = "Absolute Urban Population"
) +
theme_classic() +
theme(legend.key.size = unit(0.5, "lines"))# Aggregate data by year
yearly_data <- aggregate(happiness_score ~ year, whr2015_2022, mean)
# Convert to a time series object
ts_data <- ts(yearly_data$happiness_score, start = min(yearly_data$year), end = max(yearly_data$year), frequency = 1)
# Fit an ARIMA model
fit <- auto.arima(ts_data)
# Forecast for the next 2 years (2023 and 2024)
forecasted_values <- forecast(fit, h = 2)
# Convert the forecast object to a data frame
forecast_df <- data.frame(
year = c(yearly_data$year, (max(yearly_data$year) + 1):(max(yearly_data$year) + 2)),
value = c(fitted(fit), forecasted_values$mean),
lower = c(rep(NA, length(fitted(fit))), forecasted_values$lower[, "80%"]),
upper = c(rep(NA, length(fitted(fit))), forecasted_values$upper[, "80%"])
)
# Create the plot with ggplot2
ggplot(forecast_df, aes(x = year, y = value)) +
geom_line(size = 1, color = "darkblue") +
geom_point(color = "darkblue") +
geom_line(size = 1, data = forecast_df[length(yearly_data$year) + 1:nrow(forecast_df), ], color = "darkgrey") +
geom_point(data = forecast_df[length(yearly_data$year) + 1:nrow(forecast_df), ], color = "grey") +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey", alpha = 0.2) +
scale_x_continuous(breaks = forecast_df$year) +
labs(title = "The overall worldwide happiness score increased from 2015 to 2022",
subtitle = "Worldwide happiness score fluctuations over the years",
x = "Year", y = "Happiness Score") +
theme_minimal() +
theme(plot.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title.position = "plot")# Aggregate data by region and year
region_yearly_data <- whr2015_2022 %>%
dplyr::group_by(region, year) %>%
dplyr::summarize(avg_happiness_score = mean(happiness_score, na.rm = TRUE)) %>%
ungroup()
# Calculate overall average happiness score for each region
region_avg_scores <- region_yearly_data %>%
dplyr::group_by(region) %>%
dplyr::summarize(overall_avg_happiness = mean(avg_happiness_score)) %>%
arrange(desc(overall_avg_happiness)) %>%
ungroup()
# Create a color mapping based on these averages using the "Spectral" palette
num_regions <- nrow(region_avg_scores)
spectral_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(num_regions)
names(spectral_colors) <- region_avg_scores$region
# Adjusting the order of the 'region' factor according to happiness score
region_yearly_data$region <- factor(region_yearly_data$region, levels = region_avg_scores$region)
# Create the trend plot with custom colors
ggplot(region_yearly_data, aes(x = year, y = avg_happiness_score, group = region, color = region)) +
geom_line(size = 1) +
geom_point() +
scale_color_manual(values = spectral_colors) +
theme_minimal() +
labs(title = "The happiness score for each region remained relatively stable \nfrom 2015 to 2022",
subtitle = "Change in happiness by region over time",
x = "Year",
y = "Average Happiness Score",
color = "Region") # Melting data for stacked bar plot
melted_whr <- melt(whr2015_2022, id.vars = c("year", "country"),
measure.vars = c("economy_gdp_per_capita", "family_social_support",
"health_life_expectancy", "freedom",
"trust_government_corruption", "generosity"))
# Determine the number of unique factors (variables) for the color palette
num_factors <- length(unique(melted_whr$variable))
# Create a color mapping using the "Spectral" palette
spectral_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(num_factors)
# Define custom names for the legends
legend_names <- c("Economy (GDP per Capita)", "Family/Social Support", "Health (Life Expectancy)",
"Freedom", "Trust (Government Corruption)", "Generosity")
# Stacked bar chart with "Spectral" color set and renamed legends
ggplot(melted_whr, aes(x = as.factor(year), y = value, fill = variable)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = spectral_colors, labels = legend_names) + # Apply "Spectral" colors with custom legend names
theme_minimal() +
labs(title = "Economy and social support seem to be the two main happiness contributors",
subtitle = "Contribution of happiness factors over the years",
x = "Year", y = "Factor Contribution",
fill = "Happiness Factor")+
theme(plot.title = element_text(size = 13),
plot.title.position = "plot")# Density ridge plot
ggplot(whr2015_2022, aes(x = happiness_score, y = factor(year), fill = region)) +
geom_density_ridges(scale = 1) +
labs(title = "Western Europe always ranks first while Sub-Saharan Africa and \nSouth Asia report lower scores", x = "Happiness Score", y = "Year",
subtitle = "Distribution of Happiness Scores of Different Countries by Region and Year",
fill = "Region") +
scale_fill_viridis_d() +
theme(
panel.background = element_blank(),
axis.text.y = element_text(hjust = 1),
axis.title.x = element_text(vjust = 0.5, hjust = 0.5),
axis.title.y = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) # 2D Density plot
ggplot(whr2015_2022, aes(x = economy_gdp_per_capita, y = happiness_score)) +
geom_density_2d_filled() +
labs(title = "A positive correlation exists with the highest density observed \nat mid-range values",
subtitle = "2D Density Plot of GDP per Capita relative to Happiness Score",
caption = "Note: Density levels represent the concentration of countries at GDP and happiness score values.",
x="GDP per capita",
y="Happiness Score",
fill = "Density Range") +
theme_minimal() +
theme(
plot.caption = element_text(face = "italic", color = "grey70",hjust = 0))# Calculate mean life expectancy for each region
means <- whr2015_2022 %>%
group_by(region) %>%
summarise(mean_life_expectancy = mean(health_life_expectancy, na.rm = TRUE)) %>%
arrange(desc(mean_life_expectancy))
# Create a gradient of 10 pastel colors
colors <- colorRampPalette(c("#B3FFB3", "#FFE0B3", "#FFB3B3"))(10)
# Map these colors to the regions based on their mean life expectancy
color_mapping <- setNames(colors, means$region)
# Create the violin plot
ggplot(whr2015_2022, aes(x = reorder(region, happiness_score, FUN = mean), y = health_life_expectancy, fill = region)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.1, fill = "white") +
scale_fill_manual(values = color_mapping) +
labs(title = "Higher life expectancy and uniform distributions are in regions \nwith higher happiness scores",
subtitle = "Violin Plot of Life Expectancy by Region",
x = "Region",
y = "Life expectancy") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.margin = margin(l = 40, unit = "pt"),
legend.position = "none"
)# Categories trust in multiple bins
categorise_trust <- whr2015_2022 %>%
mutate(trust_category = case_when(
trust_government_corruption >= 0.0 & trust_government_corruption <= 0.1 ~ "Very Low",
trust_government_corruption > 0.1 & trust_government_corruption <= 0.2 ~ "Low",
trust_government_corruption > 0.2 & trust_government_corruption <= 0.3 ~ "Medium",
trust_government_corruption > 0.3 & trust_government_corruption <= 0.4 ~ "High",
trust_government_corruption > 0.4 ~ "Very High",
TRUE ~ NA_character_
)) %>%
filter(!is.na(trust_category)) %>%
mutate(trust_category = factor(trust_category, levels = c("Very Low", "Low", "Medium", "High", "Very High")))
ggplot(categorise_trust, aes(x = trust_category, y = happiness_score, color = trust_category)) +
geom_point(size=2) +
labs(title = "Countries with high trust in government seem to be mostly happy, \nwhile there is more disparity with lower levels of trust",
subtitle= "Happiness Score relative to Trust in Government",
x = "Trust in Government",
y = "Happiness Score") +
theme_minimal() +
scale_color_manual(values = c(
"Very Low" = "#FFB3B3", # Pastel red for Very Low trust
"Low" = "#FFD1B3", # Pastel orange for Low trust
"Medium" = "#FFE0B3", # Lighter orange for Medium trust
"High" = "#D9FFB3", # Pale green for High trust
"Very High" = "#B3FFB3" # Pastel green for Very High trust
)) +
theme(legend.position = "none")